"""
@author Luke Voinov, 23 Sep. 2025
Much of the code was adapted from Eric Larson's github, particularly the notebook '03. DataVisualization.ipynb'
https://github.com/eclarson/MachineLearningNotebooks/tree/master
I will denote the notebook '04. Dimension Reduction and Images.ipynb' with (1).
As you will see, the dataset is about chess positions on various board styles:
"Chess Positions". https://www.kaggle.com/datasets/koryakinp/chess-positions
The dataset contains too many images to deal with locally so we reduced the size to only 9.9k images.
"""
# This section of code is borrowed from (1) to get the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Report the versions of the core libraries this notebook was run with.
for label, module in (('Seaborn:', sns), ('Pandas:', pd), ('Numpy:', np)):
    print(label, module.__version__)
# Python: 3.12.3
Seaborn: 0.13.2 Pandas: 2.2.2 Numpy: 1.26.4
Is this dataset valid for the assignment?
1. The data includes at least 1000 images
Yes, the dataset includes 100k images (reduced to 9.9k because Luke forgot to put 100 more images into the test folder)
2. The size of the images should be larger than 20x20 pixels
Yes, the images are 400x400 pixels
3. The dataset cannot be MNIST or Fashion MNIST
The dataset is neither.
4. The dataset should have a well defined prediction task (i.e., a label for each image)
The dataset contains a tag saying this dataset was created for 'multiclass classification'. It has a training set of 80k images and a test set of 20k images.
"Labels are in a filename in Forsyth–Edwards Notation format, but with dashes instead of slashes."
Business Understanding
Give an overview of the dataset. Describe the purpose of the data set you selected (i.e., why was this data collected in the first place?). What is the prediction task for your dataset and which third parties would be interested in the results? Why is this data important? Once you begin modeling, how well would your prediction algorithm need to perform to be considered useful to the identified third parties? Be specific and use your own words to describe the aspects of the data.
The person who created this dataset says that "[t]he goal of the project is to build a model able to generate FEN description based on a schematic image of a chess board." Another use for this data that's not associated with the original use is predicting what the next best move should be.
The prediction task for our dataset is classification. We can use the images as input to classify each piece and square, which lets us generate the notation for the board, or to analyze the board and correlate the current position with a probability of winning.
# load the dataset
# this code is borrowed from Gerry P, https://stackoverflow.com/questions/72076962/how-to-load-all-the-image-paths-from-a-directory-into-a-pandas-dataframe-column
import os
import pandas as pd
# Build a one-column DataFrame ('filepaths') holding the normalized full path
# of every entry found in source_dir.
source_dir = r"C:\Users\lukev\Downloads\Chess\all"

# os.listdir returns names in arbitrary order; sorting keeps row i pointing at
# the same image from one run to the next.
filelist = sorted(os.listdir(source_dir))
filepaths = [os.path.normpath(os.path.join(source_dir, f)) for f in filelist]

Fseries = pd.Series(filepaths, name='filepaths')
df = pd.concat([Fseries], axis=1)

# info is a method: without the call the cell only echoes the bound method
# object instead of printing the column/dtype summary.
df.info()
<bound method DataFrame.info of filepaths 0 C:\Users\lukev\Downloads\Chess\all\r3K3-8-2P5-... 1 C:\Users\lukev\Downloads\Chess\all\R3k3-8-3b4-... 2 C:\Users\lukev\Downloads\Chess\all\R3k3-8-4P2P... 3 C:\Users\lukev\Downloads\Chess\all\R3k3-8-5n2-... 4 C:\Users\lukev\Downloads\Chess\all\r3K3-8-5Pk1... ... ... 1995 C:\Users\lukev\Downloads\Chess\all\rRnK1N2-4Nr... 1996 C:\Users\lukev\Downloads\Chess\all\rrR1Q3-4p3-... 1997 C:\Users\lukev\Downloads\Chess\all\Rrr3K1-3r4-... 1998 C:\Users\lukev\Downloads\Chess\all\RRr5-2r3b1-... 1999 C:\Users\lukev\Downloads\Chess\all\rRR5-8-1N2p... [2000 rows x 1 columns]>
# Code adapted from GitHub Copilot to reduce the pixel size from 400x400 to 224x224
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import time
# Load each image listed in df, convert it to grayscale, downsample it, and
# store it as one flattened float32 row of X with pixel values scaled to [0, 1].
print(f"Processing {len(df)} chess images...")
start_time = time.time()

# Get image dimensions from first image
first_img = plt.imread(df.iloc[0]['filepaths'])
print(f"Original image size: {first_img.shape}")

# High-quality resolution (224x224): individual chess pieces stay legible.
# PIL's resize() interprets this tuple as (width, height).
target_size = (224, 224)
print(f"Downsampling to: {target_size}")

# Limit to 2000 images for better quality/memory balance
n_samples = min(2000, len(df))
n_features = target_size[0] * target_size[1]
# 4 bytes per float32 value stored in X
memory_estimate_gb = (n_samples * n_features * 4) / (1024**3)
print(f"Estimated size of X: {memory_estimate_gb:.2f} GB")

df_subset = df.iloc[:n_samples].copy()
X = np.zeros((len(df_subset), n_features), dtype=np.float32)

# One pass over the paths; the former batch loop only wrapped this same
# per-image loop and had no effect on the result.
for i, path in enumerate(df_subset['filepaths']):
    img = plt.imread(path)
    # Convert to grayscale if color image (average over the channel axis)
    if img.ndim == 3:
        img = np.mean(img, axis=2)
    # PIL needs 8-bit data: an image whose max is <= 1 is treated as a [0, 1]
    # float image and rescaled, anything else is assumed to be 0-255 already.
    img_8bit = (img * 255).astype(np.uint8) if img.max() <= 1 else img.astype(np.uint8)
    img_resized = Image.fromarray(img_8bit).resize(target_size, Image.Resampling.LANCZOS)
    X[i] = np.asarray(img_resized, dtype=np.float32).ravel() / 255.0

# target_size is (width, height) for PIL, while the resized array is laid out
# as (height, width); unpack accordingly so reshape(h, w) stays correct even if
# a non-square target is chosen later.
w, h = target_size

end_time = time.time()
print(f"\nProcessing completed in {end_time - start_time:.2f} seconds")
print(f"Dataset shape: {X.shape}")
Processing 2000 chess images... Original image size: (400, 400, 3) Downsampling to: (224, 224) Processing completed in 19.98 seconds Dataset shape: (2000, 50176)
# Dataset analysis for chess images
# Get specifics of the processed chess dataset
# X is already loaded from the previous cell
# Summarize the processed dataset: preview the first three boards, then print
# the matrix dimensions. Uses X, h and w produced by the preprocessing cell.
n_samples, n_features = X.shape

# Show the first 3 processed images side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for idx, ax in enumerate(axes):
    ax.imshow(X[idx].reshape(h, w), cmap='gray')
    ax.set_title(f'Chess Board {idx+1} (Processed)')
    ax.axis('off')
plt.tight_layout()
plt.show()

# Dataset statistics, one line per fact
summary_lines = (
    "Chess Dataset Statistics:",
    f"n_samples: {n_samples}",
    f"n_features: {n_features}",
    f"Processed image size: {h} x {w}",
    "Original image size: 400 x 400",
)
for line in summary_lines:
    print(line)
Chess Dataset Statistics: n_samples: 2000 n_features: 50176 Processed image size: 224 x 224 Original image size: 400 x 400
Perform linear dimensionality reduction of the images using principal components analysis. Visualize the explained variance of each component. Analyze how many dimensions are required to adequately represent your image data. Explain your analysis and conclusion.
# manipulated from Sebastian Raschka Example (your book!)
# also from his blog here: http://sebastianraschka.com/Articles/2015_pca_in_3_steps.html
from sklearn.decomposition import PCA
# this is a scree plot
def plot_explained_variance(pca):
    """Draw a scree plot for a fitted PCA and print variance checkpoints.

    pca must expose ``explained_variance_ratio_`` (i.e. it has been fitted).

    Renders an inline plotly figure with one bar per component (individual
    explained-variance ratio) and a line for the running total, then prints
    the individual and cumulative ratios at components 10, 100 and 500 for
    every checkpoint the model has enough components to reach.
    """
    import plotly.offline
    from plotly.graph_objs import Bar, Layout, Scatter
    from plotly.graph_objs.layout import XAxis, YAxis

    plotly.offline.init_notebook_mode()  # run at the start of every notebook

    explained_var = pca.explained_variance_ratio_
    cum_var_exp = np.cumsum(explained_var)
    # 1-based component numbers so the x axis agrees with the "x=10" style
    # labels printed below.
    component_ids = np.arange(1, len(explained_var) + 1)

    plotly.offline.iplot({
        "data": [
            Bar(x=component_ids, y=explained_var, name='individual explained variance'),
            Scatter(x=component_ids, y=cum_var_exp, name='cumulative explained variance'),
        ],
        "layout": Layout(
            xaxis=XAxis(title='Principal components'),
            yaxis=YAxis(title='Explained variance ratio'),
        ),
    })

    # Print exact values at a few checkpoints; skip any the model cannot reach.
    for k in (10, 100, 500):
        if len(explained_var) < k:
            continue
        individual = explained_var[k - 1]  # component k lives at index k - 1
        cumulative = cum_var_exp[k - 1]
        print(f"\nValues at Component {k} (x={k}):")
        print(f"Individual explained variance: {individual:.6f}")
        print(f"Cumulative explained variance: {cumulative:.6f}")
        print(f"Percentage of total variance explained by first {k} components: {cumulative*100:.2f}%")
# Scree plot for the first 5 components.
pca = PCA(n_components=5)
# fit() returns the estimator itself; fit_transform() returns the projected
# data, which is what the name X_pca promises.
X_pca = pca.fit_transform(X)
# plot_explained_variance renders its own plotly figure, so no matplotlib
# plt.show() is needed afterwards.
plot_explained_variance(pca)

# added the below code to show how much variance up to 500 features can explain
pca = PCA(n_components=500)
X_pca = pca.fit_transform(X)
plot_explained_variance(pca)
Values at Component 10 (x=10): Individual explained variance: 0.004685 Cumulative explained variance: 0.703561 Percentage of total variance explained by first 100 components: 70.36% Values at Component 100 (x=100): Individual explained variance: 0.000396 Cumulative explained variance: 0.881878 Percentage of total variance explained by first 100 components: 88.19% Values at Component 500 (x=500): Individual explained variance: 0.000084 Cumulative explained variance: 0.951598 Percentage of total variance explained by first 500 components: 95.16%
# a helper plotting function
def plot_gallery(images, h, w, n_row=3, n_col=6, titles=None):
    """Plot the first n_row * n_col images on a grid of grayscale panels.

    images: indexable collection whose items can be reshaped to (h, w).
    h, w: height and width each image is reshaped to before display.
    n_row, n_col: grid dimensions.
    titles: optional sequence of per-panel titles; when omitted each panel is
        titled with its index in images.

    If fewer than n_row * n_col images are supplied, only the available ones
    are drawn and the remaining grid cells are left empty.
    """
    plt.figure(figsize=(1.7 * n_col, 2.3 * n_row))
    plt.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    # Never index past the end of images when the grid is larger than the data.
    n_panels = min(n_row * n_col, len(images))
    for i in range(n_panels):
        plt.subplot(n_row, n_col, i + 1)
        plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray)
        plt.title(i if titles is None else titles[i], size=12)
        plt.xticks(())
        plt.yticks(())
plot_gallery(X, h, w) # defaults to showing a 3 by 6 subset of the faces
# lets do some PCA of the features and go from 1850 features to 20 features
from sklearn.decomposition import PCA
n_components = 300
print ("Extracting the top %d eigenfaces from %d faces" % (
n_components, X.shape[0]))
pca = PCA(n_components=n_components)
%time pca.fit(X.copy())
eigenfaces = pca.components_.reshape((n_components, h, w))
Extracting the top 300 eigenfaces from 2000 faces CPU times: total: 1min 59s Wall time: 13.1 s
plot_explained_variance(pca)
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
# plot_gallery's positional parameters are (images, h, w): passing the titles
# list second put it in the h slot and made reshape raise a TypeError. The
# helper titles each panel with its component index.
plot_gallery(eigenfaces, h, w)
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[8], line 3 1 plot_explained_variance(pca) 2 eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])] ----> 3 plot_gallery(eigenfaces, eigenface_titles, h, w) Cell In[6], line 8, in plot_gallery(images, h, w, n_row, n_col) 6 for i in range(n_row * n_col): 7 plt.subplot(n_row, n_col, i + 1) ----> 8 plt.imshow(images[i].reshape((h, w)), cmap=plt.cm.gray) 9 plt.title(i, size=12) 10 plt.xticks(()) TypeError: 'list' object cannot be interpreted as an integer
Perform linear dimensionality reduction of your image data using randomized principal components analysis. Visualize the explained variance of each component. Analyze how many dimensions are required to adequately represent your image data. Explain your analysis and conclusion.
# lets do some PCA of the features and go from 1850 features to 300 features
n_components = 300
print ("Extracting the top %d eigenfaces from %d faces" % (
n_components, X.shape[0]))
rpca = PCA(n_components=n_components, svd_solver='randomized')
%time rpca.fit(X.copy())
eigenfaces = rpca.components_.reshape((n_components, h, w))
eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])]
plot_gallery(eigenfaces, eigenface_titles, h, w)
Exceptional Work (1 points total) One idea (required for 7000 level students): Perform feature extraction upon the images using DAISY. Rather than using matching on the images with the total DAISY vector, you will instead use key point matching. You will need to investigate appropriate methods for key point matching using DAISY. NOTE: this often requires some type of brute force matching per pair of images, which can be computationally expensive. Does it perform better than not using key point matching?